{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from glob import glob\n",
    "import json\n",
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/qa/squad/ms-train-2.0.json\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/qa/squad/ms-train-1.1.json\n",
    "# !wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json\n",
    "# !wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json\n",
    "# !wget https://adversarialqa.github.io/data/aqa_v1.0.zip\n",
    "# !unzip aqa_v1.0.zip\n",
    "# !rm aqa_v1.0.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/qa/squad/ms-dev-2.0.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://dl.fbaipublicfiles.com/MLQA/MLQA_V1.zip\n",
    "# !unzip MLQA_V1.zip\n",
    "# !rm MLQA_V1.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "MLQA_V1/test/test-context-en-question-en.json\n",
      "MLQA_V1/dev/dev-context-en-question-en.json\n",
      "xquad.en.json\n",
      "combined/train.json\n",
      "combined/dev.json\n",
      "train-v1.1.json\n",
      "train-v2.0.json\n",
      "ms-train-1.1.json\n",
      "ms-train-2.0.json\n"
     ]
    }
   ],
   "source": [
    "# Build the training split: flatten each SQuAD-format file listed below into\n",
    "# JSON lines shaped {'translation': {'src', 'tgt', 'prefix'}}.\n",
    "MAX_CONTEXT_WORDS = 600  # contexts with more whitespace-separated words are skipped\n",
    "NO_ANSWER = 'no answer'  # target emitted when a question has no usable answer\n",
    "PREFIX = 'read the following context and answer the question given: '\n",
    "\n",
    "# NOTE(review): this list mixes dev/test splits of other corpora into training.\n",
    "# xquad.en.json is presumably derived from SQuAD dev data -- confirm it does not\n",
    "# overlap with the dev-v2.0.json file used for the evaluation split.\n",
    "files = ['MLQA_V1/test/test-context-en-question-en.json',\n",
    "         'MLQA_V1/dev/dev-context-en-question-en.json',\n",
    "         'xquad.en.json',\n",
    "         'combined/train.json',\n",
    "         'combined/dev.json',\n",
    "         'train-v1.1.json',\n",
    "         'train-v2.0.json',\n",
    "         'ms-train-1.1.json',\n",
    "         'ms-train-2.0.json']\n",
    "\n",
    "# Explicit UTF-8 so reading the datasets does not depend on the machine locale.\n",
    "with open('train-flan.json', 'w', encoding='utf-8') as fopen_jsonl:\n",
    "    for f in files:\n",
    "        print(f)\n",
    "        with open(f, encoding='utf-8') as fopen:\n",
    "            data = json.load(fopen)\n",
    "\n",
    "        for article in data['data']:\n",
    "            for p in article['paragraphs']:\n",
    "                text = p['context']\n",
    "                if len(text.split()) > MAX_CONTEXT_WORDS:\n",
    "                    continue\n",
    "                for q in p['qas']:\n",
    "                    qs = q['question']\n",
    "                    # A missing or empty answer list is treated like an explicit\n",
    "                    # is_impossible flag instead of raising KeyError/IndexError.\n",
    "                    answers = q.get('answers') or []\n",
    "                    if q.get('is_impossible', False) or not answers:\n",
    "                        a = NO_ANSWER\n",
    "                    else:\n",
    "                        # Only the first reference answer is kept; blank text\n",
    "                        # falls back to the no-answer target.\n",
    "                        a = answers[0]['text'] or NO_ANSWER\n",
    "\n",
    "                    src = f'context: {text} question: {qs}'\n",
    "                    d = {\"translation\": {\"src\": src, \"tgt\": a, 'prefix': PREFIX}}\n",
    "                    fopen_jsonl.write(f'{json.dumps(d)}\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dev-v2.0.json\n",
      "ms-dev-2.0.json\n"
     ]
    }
   ],
   "source": [
    "# Build the evaluation split: flatten each SQuAD-format file listed below into\n",
    "# JSON lines shaped {'translation': {'src', 'tgt', 'prefix'}}.\n",
    "MAX_CONTEXT_WORDS = 600  # contexts with more whitespace-separated words are skipped\n",
    "NO_ANSWER = 'no answer'  # target emitted when a question has no usable answer\n",
    "PREFIX = 'read the following context and answer the question given: '\n",
    "\n",
    "files = ['dev-v2.0.json',\n",
    "         'ms-dev-2.0.json']\n",
    "\n",
    "# Explicit UTF-8 so reading the datasets does not depend on the machine locale.\n",
    "with open('test-flan.json', 'w', encoding='utf-8') as fopen_jsonl:\n",
    "    for f in files:\n",
    "        print(f)\n",
    "        with open(f, encoding='utf-8') as fopen:\n",
    "            data = json.load(fopen)\n",
    "\n",
    "        for article in data['data']:\n",
    "            for p in article['paragraphs']:\n",
    "                text = p['context']\n",
    "                if len(text.split()) > MAX_CONTEXT_WORDS:\n",
    "                    continue\n",
    "                for q in p['qas']:\n",
    "                    qs = q['question']\n",
    "                    # A missing or empty answer list is treated like an explicit\n",
    "                    # is_impossible flag instead of raising KeyError/IndexError.\n",
    "                    answers = q.get('answers') or []\n",
    "                    if q.get('is_impossible', False) or not answers:\n",
    "                        a = NO_ANSWER\n",
    "                    else:\n",
    "                        # Only the first reference answer is kept; blank text\n",
    "                        # falls back to the no-answer target.\n",
    "                        a = answers[0]['text'] or NO_ANSWER\n",
    "\n",
    "                    src = f'context: {text} question: {qs}'\n",
    "                    d = {\"translation\": {\"src\": src, \"tgt\": a, 'prefix': PREFIX}}\n",
    "                    fopen_jsonl.write(f'{json.dumps(d)}\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Seeded pure-Python shuffle (replaces `shuf`) so the line order is reproducible\n",
    "# across runs and does not depend on GNU coreutils being installed.\n",
    "SHUFFLE_SEED = 42\n",
    "with open('train-flan.json', encoding='utf-8') as fopen:\n",
    "    lines = fopen.readlines()  # whole file is held in memory, as with shuf\n",
    "random.Random(SHUFFLE_SEED).shuffle(lines)\n",
    "with open('shuffled-train-flan.json', 'w', encoding='utf-8') as fopen_shuffled:\n",
    "    fopen_shuffled.writelines(lines)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Seeded pure-Python shuffle (replaces `shuf`) so the line order is reproducible\n",
    "# across runs and does not depend on GNU coreutils being installed.\n",
    "SHUFFLE_SEED = 42\n",
    "with open('test-flan.json', encoding='utf-8') as fopen:\n",
    "    lines = fopen.readlines()  # whole file is held in memory, as with shuf\n",
    "random.Random(SHUFFLE_SEED).shuffle(lines)\n",
    "with open('shuffled-test-flan.json', 'w', encoding='utf-8') as fopen_shuffled:\n",
    "    fopen_shuffled.writelines(lines)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\"translation\": {\"src\": \"context: On September 9, 2007, West suggested that his race had something to do with his being overlooked for opening the 2007 MTV Video Music Awards (VMAs) in favor of Britney Spears; he claimed, \\\"Maybe my skin\\u2019s not right.\\\" West was performing at the event; that night, he lost all five awards that he was nominated for, including Best Male Artist and Video of the Year. After the show, he was visibly upset that he had lost at the VMAs two years in a row, stating that he would not come back to MTV ever again. He also appeared on several radio stations saying that when he made the song \\\"Stronger\\\" that it was his dream to open the VMAs with it. He has also stated that Spears has not had a hit in a long period of time and that MTV exploited her for ratings. question: With what song did Kanye aspire to open the VMAs with?\", \"tgt\": \"Stronger\", \"prefix\": \"read the following context and answer the question given: \"}}\r\n",
      "{\"translation\": {\"src\": \"context: Critical and commercial reception to PS3 improved over time, after a series of price revisions, Blu-ray's victory over HD DVD, and the release of several well received titles. Ars Technica's original launch review gave PS3 only a 6/10, but second review of the console in June 2008 rated it a 9/10. In September 2009, IGN named PlayStation 3 the 15th best gaming console of all time, behind both of its competitors: Wii (10th) and Xbox 360 (6th). However, PS3 has won IGN's \\\"Console Showdown\\\"\\u2014based on which console offers the best selection of games released during each year\\u2014in three of the four years since it began (2008, 2009 and 2011, with Xbox winning in 2010). IGN judged PlayStation 3 to have the best game line-up of 2008, based on their review scores in comparison to those of Wii and Xbox 360. In a comparison piece by PC mag's Will Greenwald in June 2012, PS3 was selected as an overall better console compared to Xbox 360. Pocket-lint said of the console \\\"The PS3 has always been a brilliant games console,\\\" and that \\\"For now, this is just about the best media device for the money.\\\" question: PS3 is in the same market as?\", \"tgt\": \"Wii (10th) and Xbox 360\", \"prefix\": \"read the following context and answer the question given: \"}}\r\n"
     ]
    }
   ],
   "source": [
    "# Sanity check: print the first two lines of the shuffled training file.\n",
    "!head -n 2 shuffled-train-flan.json"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
